Below is all relevant packages used to complete this project:
In [74]:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import ast
import altair as alt
from tqdm import tqdm
import networkx as nx
import json
from networkx.readwrite import json_graph
import vl_convert as vlc
import random
import matplotlib.colors as mcolors
from netwulf import visualize, draw_netwulf
import community
from wordcloud import WordCloud
from joblib import Parallel, delayed
import os # This is imported in order to store API key as environment variable
from openai import OpenAI
import random
Table of Contents¶
1. Motivation ¶
- In the first part of the the data collection we utilize the python libraries BeautifulSoup and Selinium to scrape the Products and textual data from the BilkaToGO website.
- In the second part of the data collection we use the Salling Group API to get the products that are frequently bought together i.e. the neighbors of each product.
2.1 Data collection Webscraping ¶
In [34]:
##INSERT WEBSCRAPING CODE HERE
2.2 Preprocessing and Cleaning ¶
In this section the data is filtered and cleaned to prepare it to be used in the analysis. The preprocessing is handeled in 3 steps:
- Product data (ids, names, prices, categories)
- Textual data (product descriptions)
- Neighbors data (frequently bought together) The pandas library is used to read the csv files for the webscraped data:
In [35]:
df_products = pd.read_csv('data/df_Salling_Products.csv' , delimiter=";") # Read the csv file containing the products_id
df_text = pd.read_csv('data/df_Salling_Products_Descriptions_CLEANED.csv' , delimiter=";") # Read the csv file containing the descriptions
df_neighbours = pd.read_csv('data/df_Salling_Products_Neighbours.csv', delimiter=";") # Read the csv file containing the neighbours to a specific product_id
In [36]:
# Cleaning product id data by stripping unnecessary characters ect.
df_products[['price_amount', 'unit']] = df_products['price'].str.split('/', expand=True) # Cleaning price attribute
df_products['price_amount'] = df_products['price_amount'].str.replace(',', '.').str.extract('(\d+.\d+)').astype(float) # Cleaning price attribute
df_products['category'] = df_products['category'].str.replace('/','') # Cleaning category attribute
df_products.drop(columns=['Unnamed: 0'], inplace=True) # Drop the original price column
#Distribute the 21 different inner categories into 3 main categories: Foods, House and Other
foods = ['frugt-og-groent', 'koed-og-fisk', 'mejeri-og-koel', 'drikkevarer', 'broed-og-kager','kolonial', 'slik-og-snacks', 'frost']
house = ['husholdning', 'personlig-pleje', 'baby-og-boern', 'bolig-og-koekken', 'fritid-og-sport', 'toej-og-sko', 'have', 'leg', 'byggemarked']
other = ['dyremad','elektronik','biludstyr', 'kiosk']
df_products['outer_category'] = df_products['category'].map(lambda x: 'Foods' if x in foods else ('House' if x in house else 'Other'))
df_products.head()
Out[36]:
| product_id | name | price | link | category | price_amount | unit | outer_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18381 | Bananer | 2,75/Stk. | /produkt/bananer/18381/ | frugt-og-groent | 2.75 | Stk. | Foods |
| 1 | 51061 | Peberfrugter røde | 7,25/Stk. | /produkt/peberfrugter-roede/51061/ | frugt-og-groent | 7.25 | Stk. | Foods |
| 2 | 61090 | Agurk øko | 9,00/Stk. | /produkt/salling-oeko-agurk-oeko/61090/ | frugt-og-groent | 9.00 | Stk. | Foods |
| 3 | 72008 | Bananer 4 pak øko | 2,20/Stk. | /produkt/bananer-4-pak-oeko/72008/ | frugt-og-groent | 2.20 | Stk. | Foods |
| 4 | 18323 | Gulerødder | 10,00/Kg. | /produkt/salling-guleroedder/18323/ | frugt-og-groent | 10.00 | Kg. | Foods |
Distribution of the categories of the webscraped products.
In [37]:
alt.data_transformers.disable_max_rows()
domain=['Foods', 'House', 'Other']
crange=['#2ca02c','#1f77b4', '#ff7f0e', ]
all_chart = alt.Chart(df_products).mark_bar().encode(
x='count():Q',
y=alt.Y('category:N', sort='-x', title='Inner Category'),
color=alt.Color('outer_category:N',scale=alt.Scale(domain=domain, range=crange),title='Outer Category'),
tooltip=['category', 'outer_category', 'count()']
).properties(
title='Distribution of the products in the different categories',
width=300,
height=300
)#.configure(background='transparent')
df_filtered = df_products[df_products['outer_category'] == 'Foods']
foods_chart = alt.Chart(df_filtered).mark_bar().encode(
x='count():Q',
y=alt.Y('category:N', sort='-x', title='Inner Categories'),
color=alt.Color('outer_category:N',scale=alt.Scale(domain=domain, range=crange),title='Outer Category'),
tooltip=['category', 'outer_category', 'count()']
).properties(
title='Distribution of the products in the Foods category only',
width=300,
height=300
)#.configure(background='transparent')
concated = (all_chart | foods_chart).interactive()#.configure(background='transparent')
concated
#concated.save('images/concated_chart_categories.png', scale_factor=2.0)
c:\Users\17kkj\miniconda3\envs\social\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False) c:\Users\17kkj\miniconda3\envs\social\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False) c:\Users\17kkj\miniconda3\envs\social\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False) c:\Users\17kkj\miniconda3\envs\social\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[37]: